Standard Analyses

Author

Benjamin Doran

Published

December 1, 2023

Julia library setup
using DrWatson
@quickactivate projectdir()

using CSV, DataFrames, Muon
using Random: seed!
using UMAP, Distances

using LinearAlgebra, StatsBase

using StatsPlots
theme(:default, grid=false, tickdir=:out, label=false)
include(srcdir("helpers.jl"))

bbdir = datadir("exp_raw", "BB669");
tdir_16S = projectdir("_research", "BB669_16S_phyml")
tdir_bac120 = projectdir("_research", "BB669_bac120_phyml")
pdir_16S = plotsdir("BB669_16Strees") |> mkpath
pdir_bac120 = plotsdir("BB669_bac120trees") |> mkpath;
R library setup
using RCall
R"""
library(ape)
library(treeio)
library(ggtree)
library(ggplot2)
library(tidyverse)

setwd($(projectdir()))
""";

Standard analysis

This notebook gives context for what strains are in our strain bank and shows that standard analyses don’t work for capturing genomic strain variation.

# read in data
biobank = readh5mu(joinpath(bbdir, "BB669.h5mu"))
biobank.obs[!,:kept_species] = convert.(Bool,biobank.obs[!,:kept_species]);
biobank
┌ Warning: Cannot join columns with the same name because var_names are intersecting.
└ @ Muon /Users/bend/.julia/packages/Muon/eLqpV/src/mudata.jl:351
MuData object 669 ✕ 21475
└ metabolites_foldchange
  AnnData object 669 ✕ 50
└ UPorder_oggs
  AnnData object 669 ✕ 10177
└ oggs
  AnnData object 669 ✕ 11248
# we have taxanomic information each of these strains
# GTDB_* are the annotations from https://gtdb.ecogenomic.org/
# NCBI_* are the annotation post-submission to NCBI
# * annotations are pre-submission to NCBI
# donor specifies which human microbiome each strain was sampled from
#       <hospital>.<donor number>
bbobs = biobank.obs;
first(bbobs, 5)
5×22 DataFrame
Row Strain_ID Accession Phylum Class Order Family Genus Species Donor NCBI_Phylum NCBI_Class NCBI_Order NCBI_Family NCBI_Genus NCBI_Species GTDB_Phylum GTDB_Class GTDB_Order GTDB_Family GTDB_Genus GTDB_Species kept_species
String String String String String String String String String String String String String String String String String String String String String Bool
1 MSK.16.19 JAHOLG000000000 Bacteroidetes Bacteroidia Bacteroidales Bacteroidaceae Bacteroides Bacteroides eggerthii MSK.16 Bacteroidetes Bacteroidia Bacteroidales Bacteroidaceae Bacteroides Bacteroides eggerthii Bacteroidota Bacteroidia Bacteroidales Bacteroidaceae Bacteroides Bacteroides eggerthii false
2 MSK.10.5 JAHOMY000000000 Bacteroidetes Bacteroidia Bacteroidales Bacteroidaceae Bacteroides Bacteroides fragilis MSK.10 Bacteroidetes Bacteroidia Bacteroidales Bacteroidaceae Bacteroides Bacteroides fragilis Bacteroidota Bacteroidia Bacteroidales Bacteroidaceae Bacteroides Bacteroides fragilis_A false
3 MSK.13.23 JAHOMK000000000 Bacteroidetes Bacteroidia Bacteroidales Bacteroidaceae Bacteroides Bacteroides fragilis MSK.13 Bacteroidetes Bacteroidia Bacteroidales Bacteroidaceae Bacteroides Bacteroides fragilis Bacteroidota Bacteroidia Bacteroidales Bacteroidaceae Bacteroides Bacteroides fragilis_A false
4 MSK.16.61 JAHOKD000000000 Bacteroidetes Bacteroidia Bacteroidales Bacteroidaceae Phocaeicola Phocaeicola vulgatus MSK.16 Bacteroidetes Bacteroidia Bacteroidales Bacteroidaceae Phocaeicola Phocaeicola vulgatus Bacteroidota Bacteroidia Bacteroidales Bacteroidaceae Phocaeicola Phocaeicola vulgatus true
5 MSK.18.56 JAHOHX000000000 Bacteroidetes Bacteroidia Bacteroidales Bacteroidaceae Bacteroides Bacteroides caccae MSK.18 Bacteroidetes Bacteroidia Bacteroidales Bacteroidaceae Bacteroides Bacteroides caccae Bacteroidota Bacteroidia Bacteroidales Bacteroidaceae Bacteroides Bacteroides caccae false
# concentrated for lachno's and bacteroidaceae 
# (matches distibutions normally found in these types of strain bank)
family_counts = sort(countmap(bbobs.NCBI_Family), byvalue=true, rev=true)
OrderedCollections.OrderedDict{String, Int64} with 12 entries:
  "Lachnospiraceae"              => 347
  "Bacteroidaceae"               => 229
  "Bifidobacteriaceae"           => 30
  "Tannerellaceae"               => 21
  "Prevotellaceae"               => 14
  "Rikenellaceae"                => 9
  "Erysipelotrichaceae"          => 5
  "<unclassified>"               => 4
  "Oscillospiraceae"             => 4
  "Odoribacteraceae"             => 4
  "Enterobacteriaceae"           => 1
  "Eubacteriales incertae sedis" => 1
pie(family_counts, 
    c=palette(:Set3_12),
    legend=:outerright,
    size=(700,600),
)

proportion CSB strains from different NCBI families

species_counts = collect(values(countmap(bbobs.NCBI_Species)));
histogram(
    species_counts, 
    bins=100,
    xlabel="# of replicates",
    ylabel="# of species",
    size=(500,500),
)

each bar represents the number of measured species (y axis) with a certain number of replicates (x axis). Most species we classify only have 1 strain representative, 11 species have at least 20 strain representatives.

Phyml on 16S alignment of Biobank strains

16S is a ribosomal protein (involved with translating RNA to Proteins) that is present across all bacteria. For that reason, it is commonly used as a gene marker for constructing ancestral trees of bacteria and taxanomic classification.

What we show is that 16S annotations are only good up to species level classification. At that point all strains are completely indistinguishable from each other.

# run(`sbatch $(projectdir("scripts", "run_phyml_csb16S.jl"))`);

Triggers run of scripts/run_phyml_csb16S.jl script on SLURM compute cluster. Has the effect of running

phyml \
    -dnt -mHKY85 -fe -o tlr \
    --search SPR --r_seed 123456 --rand_start --n_rand_starts 3 \
    --no_memory_check \
    --bootstrap -4 \
    -i $PROJECTDIR/data/exp_raw/BB669/BB669_16S.phy

outputting the results in _research/BB669_16S_phyml

plot PhyML tree of 16S alignment

R"""
## read in tree and metadata
tree = read.newick(file.path($tdir_16S, "BB669_16S.phy-supporttree_multi.txt"), node.label='support')
lineagedf = read.csv(file.path($bbdir, "BB669_rowmeta.tsv"), sep="\t")
lineagedf$label = lineagedf$Strain.ID
subset_lineagedf = lineagedf[match(tree@phylo$tip.label, lineagedf$label),]
biggerfamily_ncbi = sort(table(subset_lineagedf$NCBI.Family), decreasing = TRUE)[sort(table(subset_lineagedf$NCBI.Family), decreasing = TRUE) > 0]
# labels(biggerfamily_ncbi)[[1]]
tree_withdata = as.treedata(left_join(as_tibble(tree), subset_lineagedf, by='label'))
btree_tbl = as_tibble(tree_withdata)

## Calcuate most common family desendent from each internal node of the tree
mode_family_ncbi = offspring(btree_tbl, btree_tbl$node, tiponly = FALSE, self_include = TRUE)
mode_family_ncbi = unlist(lapply(mode_family_ncbi, function(x) modeest::mfv1(x$NCBI.Family, na_rm=TRUE)))
btree_tbl$mode_family_ncbi = mode_family_ncbi
tree_withdata = as.treedata(btree_tbl)

## plot tree colored with most common family at each node
options(repr.plot.width=15, repr.plot.height=15, ignore.negative.edge=TRUE)
colorvalues = RColorBrewer::brewer.pal(12, name="Set3")
p = ggtree(tree_withdata, 
        aes(color=mode_family_ncbi, subset=support > .5), 
        layout="equal_angle", open.angle=5,
        branch.length="none", size=1.5,
    ) +
    ggtitle("CSB (PhyML) 16S") + 
    scale_color_manual(values=colorvalues, breaks=labels(biggerfamily_ncbi)[[1]]) #+theme(legend.position="none")
ggsave(file.path($pdir_16S, "CSB669_PhyML_unrooted.svg"), 
    p,
    height=10,
    width=10,
)
"""
show_svg(joinpath(pdir_16S, "CSB669_PhyML_unrooted.svg"))

R"""
# plot M. gnavus subtree
colorvalues = RColorBrewer::brewer.pal(12, name="Set3")
p = ggtree(tree_subset(tree_withdata, "MSK.5.17", levels_back = 1), 
        aes(color=Donor), 
        layout="rect", open.angle=10,
    ) + 
    ggtitle("CSB (PhyML) 16S M. gnavus subtree") + 
    theme(
        legend.position = "none", 
    ) +
    xlim(0, 4) +
    geom_tiplab(aes(label=Donor))
ggsave(file.path($pdir_16S, "CSB669_PhyML_rgnavus_16S.svg"),
    p, 
    height=7,
    width=4.5,
)
""";
show_svg(joinpath(pdir_16S, "CSB669_PhyML_rgnavus_16S.svg"))

R"""
# plot M. gnavus subtree
colorvalues = RColorBrewer::brewer.pal(12, name="Set3")
p = ggtree(tree_subset(tree_withdata, "MSK.19.84", levels_back = 3), 
        aes(color=Donor), 
        layout="rect", open.angle=10,
    ) + 
    ggtitle("CSB (PhyML) 16S B. uniformis subtree") + 
    theme(
        legend.position = "none", 
    ) +
    xlim(0, 4) +
    geom_tiplab(aes(label=Donor))
ggsave(file.path($pdir_16S, "CSB669_PhyML_buniformis_16S.svg"),
    p, 
    height=7,
    width=4.5,
)
""";
show_svg(joinpath(pdir_16S, "CSB669_PhyML_buniformis_16S.svg"))

Phyml on Bac120 alignment of Biobank strains

Bac120 aimed to be an improvement over 16S annotation. It takes 120 genes that are reasonably well conserved across bacteria, and takes ~50 genomic positions from each of those genes, creating an annotation of around 6000 features.

We find that these annotations equally only work down to species level classification, with strain level replicates all being identically annotated.

# run(`sbatch $(projectdir("scripts", "run_phyml_csbbac120.jl"))`);

Triggers run of scripts/run_phyml_csbbac120.jl script on SLURM compute cluster. Has the effect of running

phyml -daa -mLG -fe -o tlr \
    --search SPR --r_seed 123456 --rand_start --n_rand_starts 3 \
    --no_memory_check \
    --bootstrap -4 \
    -i $PROJECTDIR/data/exp_raw/BB669/BB669_bac120.phy

outputting the results in _research/BB669_bac120_phyml

plot PhyML tree of Bac120 alignment

R"""
## read in tree and metadata
tree = read.newick(file.path($tdir_bac120, "BB669_bac120.phy-supporttree_multi.txt"), node.label='support')
lineagedf = read.csv(file.path($bbdir, "BB669_rowmeta.tsv"), sep="\t")
lineagedf$label = lineagedf$Strain.ID
subset_lineagedf = lineagedf[match(tree@phylo$tip.label, lineagedf$label),]
biggerfamily_ncbi = sort(table(subset_lineagedf$NCBI.Family), decreasing = TRUE)[sort(table(subset_lineagedf$NCBI.Family), decreasing = TRUE) > 0]
# labels(biggerfamily_ncbi)[[1]]
tree_withdata = as.treedata(left_join(as_tibble(tree), subset_lineagedf, by='label'))
btree_tbl = as_tibble(tree_withdata)

## Calcuate most common family desendent from each internal node of the tree
mode_family_ncbi = offspring(btree_tbl, btree_tbl$node, tiponly = FALSE, self_include = TRUE)
mode_family_ncbi = unlist(lapply(mode_family_ncbi, function(x) modeest::mfv1(x$NCBI.Family, na_rm=TRUE)))
btree_tbl$mode_family_ncbi = mode_family_ncbi
tree_withdata = as.treedata(btree_tbl)

## plot tree colored with most common family at each node
options(repr.plot.width=15, repr.plot.height=15, ignore.negative.edge=TRUE)
colorvalues = RColorBrewer::brewer.pal(12, name="Set3")
p = ggtree(tree_withdata, 
        aes(color=mode_family_ncbi, subset=support > .5), 
        layout="equal_angle", open.angle=5,
        branch.length="none", size=1.5,
    ) +
    ggtitle("CSB (PhyML) Bac120") + 
    scale_color_manual(values=colorvalues, breaks=labels(biggerfamily_ncbi)[[1]]) #+theme(legend.position="none")
ggsave(file.path($pdir_bac120, "CSB669_PhyML_unrooted_bac120.svg"), 
    p,
    height=10,
    width=10,
)
"""
show_svg(joinpath(pdir_bac120, "CSB669_PhyML_unrooted_bac120.svg"))

# plot M. gnavus subtree
R"""
colorvalues = RColorBrewer::brewer.pal(12, name="Set3")
p = ggtree(tree_subset(tree_withdata, "MSK.5.17", levels_back = 1), 
        aes(color=Donor), 
        layout="rect", open.angle=10,
    ) + 
    ggtitle("CSB (PhyML) bac120 M. gnavus subtree") + 
    theme(
        legend.position = "none", 
    ) +
    xlim(0, 4) +
    geom_tiplab(aes(label=Donor))
ggsave(file.path($pdir_16S, "CSB669_PhyML_rgnavus_bac120.svg"),
    p, 
    height=7,
    width=4.5,
)
""";
show_svg(joinpath(pdir_16S, "CSB669_PhyML_rgnavus_bac120.svg"))

# plot M. gnavus subtree
R"""
colorvalues = RColorBrewer::brewer.pal(12, name="Set3")
p = ggtree(tree_subset(tree_withdata, "MSK.19.84", levels_back = 5), 
        aes(color=Donor), 
        layout="rect", open.angle=10,
    ) + 
    ggtitle("CSB (PhyML) bac120 B. uniformis subtree") + 
    theme(
        legend.position = "none", 
    ) +
    xlim(0, 4) +
    geom_tiplab(aes(label=Donor))
ggsave(file.path($pdir_16S, "CSB669_PhyML_buniformis_bac120.svg"),
    p, 
    height=7,
    width=4.5,
)
""";
show_svg(joinpath(pdir_16S, "CSB669_PhyML_buniformis_bac120.svg"))

Metabolite variablilty

However, these strains are not functionally identical. Strains of the same species, digest/produce different amounts of metabolite compounds. And it is not simply individual variation. In the plot below we see bi-modal distribution indicating that there are groups of strains that behave differently.

theme(:default, grid=false, tickdir=:out, label=false)
pdir_met = plotsdir("metabolite_strainvariation") |> mkpath
"/Users/bend/projects/Doran_etal_2023/plots/metabolite_strainvariation"
# load metabolites matrix
metmtx = biobank["metabolites_foldchange"].X[:, :][biobank.obs.kept_species,:];

# too close to zero for detectable measurment, so assume no change
metmtx[isinf.(metmtx)] .= 0.0; 
@show size(metmtx)

# filter to metabolites that have at least 10% detectable shifts in measurement
metabolite_names_full = biobank["metabolites_foldchange"].var_names.vals;
keepmetabolites_mask = mapslices(c->mean(c .== 0.0) < 0.9, metmtx, dims=1) |> vec;
metabolite_names = metabolite_names_full[keepmetabolites_mask]
metmtx = metmtx[:, keepmetabolites_mask];
@show size(metmtx);
size(metmtx) = (356, 50)
size(metmtx) = (356, 32)
speciescolors = CSV.read(datadir("exp_raw", "BB669", "subsettreecolors.csv"), DataFrame);
metabolitelabels = biobank["metabolites_foldchange"].var.label;
metbbobs = bbobs[biobank.obs.kept_species, :];
metabolite_species = sort(countmap(metbbobs.Species), byvalue=true, rev=true)
OrderedCollections.OrderedDict{String, Int64} with 11 entries:
  "Phocaeicola vulgatus"         => 88
  "[Ruminococcus] gnavus"        => 41
  "Bacteroides thetaiotaomicron" => 35
  "Anaerostipes hadrus"          => 31
  "Bacteroides uniformis"        => 27
  "Blautia luti"                 => 24
  "Bifidobacterium breve"        => 24
  "Coprococcus comes"            => 23
  "Dorea formicigenerans"        => 22
  "Blautia wexlerae"             => 21
  "[Eubacterium] rectale"        => 20
show plotting code
ps = []
for (i, species) in enumerate(keys(metabolite_species))
    speciesmask = occursin.(species, metbbobs.Species)
    p = violin(permutedims(1:size(metmtx, 2)),
        metmtx[speciesmask, :],
        ylabel="$(last(split(species," "))) ($(metabolite_species[species]))",
        c=speciescolors.color[findfirst(==(species), speciescolors.species_name)],
        xticks= i>=10 ? (1:size(metmtx, 2), metabolitelabels) : (1:size(metmtx, 2), repeat([""], size(metmtx, 2))),
        xrotation=90,
    )
    dotplot!(permutedims(1:size(metmtx, 2)),
        metmtx[speciesmask, :], 
        c=:black, alpha=.2, markerstrokewidth=0, markersize=1,
        # side=:right
    )
    push!(ps, p)
end

plot(ps..., 
    layout=grid(ceil(Int, length(metabolite_species)/2), 2), 
    plot_title = "CSB metabolite strain variation",
    size=(800, 800),
    tickdirection=:out,
    labelfontsize=6,
    tickfontsize=6,
    lw=.5,
    markersize=1
)

show log-2 fold change from base media across strains from 9 species with more than 20 replicates. multi-modal distributions for some distributions indicates sub-species phylogentic variation in phenotype.

savefig(joinpath(pdir_met, "violinplots_32metabolites_strainlevel_variation.pdf"))
"/Users/bend/projects/Doran_etal_2023/plots/metabolite_strainvariation/violinplots_32metabolites_strainlevel_variation.pdf"

Proforming PCA analysis

Using the orthologous gene-group annotations can we represent these strains in a way that captures functional diversity?

We find that using traditional PCA fails to create this representation. Regardless of if we subset to narrow taxanomic groups, metabolic variation is never the principle axis of variation.

theme(:default, grid=false, framestyle=:box, label="", ratio=1)

Full CSB

csb_mtx = biobank["oggs"].X[:,:]

# find variable OGGs
ogg_mask = vec(var(csb_mtx, dims=1) .> 0)
# mean center variable OGGs
meancentered_csbmtx = mapslices(x->x.-mean(x), csb_mtx[:, ogg_mask], dims=1)
# svd of mean centered matrix
bbusv_csb = svd(meancentered_csbmtx);

@show size(meancentered_csbmtx);
size(meancentered_csbmtx) = (669, 8359)
# Principal Components from SVD
pcs_csb = bbusv_csb.U * Diagonal(bbusv_csb.S);
# percent of variance explained by each PC
pctvar = (bbusv_csb.S .^ 2 / sum(bbusv_csb.S .^ 2)) * 100;
map colors to NCBI families in correct order for plotting
familyid = bbobs.NCBI_Family
orderedfamilylabels = stack(DataFrame(countmap(familyid)), 1:12) |>
    df -> DataFrames.transform(df, :value => (-) => :minusvalue) |>
    df -> sort(df, [:minusvalue, :variable], rev=false) |>
    df -> df.variable
familycolors = permutedims(palette(:Set3_12).colors.colors[indexin(sort(unique(familyid)), orderedfamilylabels)]);

# collect Butyrate and Succinate values for colormap
butyrate_foldchange_per_strain = biobank["metabolites_foldchange"].X[:, findfirst(==("Butyrate"), biobank["metabolites_foldchange"].var.label)];
succinate_foldchange_per_strain = biobank["metabolites_foldchange"].X[:, findfirst(==("Succinate"), biobank["metabolites_foldchange"].var.label)];

# get color limits based on most extreme value
butyrate_lims = getlims(butyrate_foldchange_per_strain)
succinate_lims = getlims(succinate_foldchange_per_strain);

The principal axis of variation across the strain bank is phylum, within each phyla there is no pattern to which strains produce/digest metabolites.

scatter(pcs_csb[:, 1], pcs_csb[:, 2],
    group=familyid,
    alpha=1, markerstrokewidth=.3,
    size=(600,600),
    legend=:outertop,
    legendcolumns=2,
    c=familycolors,
    margin=5Plots.mm,
    ticks=:none,
    title="CSB669 (2PCs)",
    xlabel="PC 1 ($(round(pctvar[1], digits=2))%)",
    ylabel="PC 2 ($(round(pctvar[2], digits=2))%)",
)
scatter(pcs_csb[:, 1], pcs_csb[:, 2],
    marker_z=butyrate_foldchange_per_strain,
    c=:bwr, clims=butyrate_lims,
    alpha=.5, markerstrokewidth=.3, ticks=false,
    size=(600,600), margin=5Plots.mm,
    title="colored by butyrate",
    xlabel="PC 1 ($(round(pctvar[1], digits=2))%)",
    ylabel="PC 2 ($(round(pctvar[2], digits=2))%)",
)
scatter(pcs_csb[:, 1], pcs_csb[:, 2],
    marker_z=succinate_foldchange_per_strain,
    c=:bwr, clims=succinate_lims,
    alpha=.5, markerstrokewidth=.3, ticks=false,
    size=(600,600), margin=5Plots.mm,
    title="colored by succinate",
    xlabel="PC 1 ($(round(pctvar[1], digits=2))%)",
    ylabel="PC 2 ($(round(pctvar[2], digits=2))%)",
)
show plotting code
species_labelscounts = map(speciescolors.species_name[Not(5,6)]) do label
    cnt = occursin.(biobank.obs.Species, label) |> sum
    label, cnt
end |> x->sort(x, by=last, rev=true)

ps = []
for (label, cnt) in species_labelscounts
    mask = occursin.(biobank.obs.Species, label)
    p = plot()
    scatter!(pcs_csb[.!mask, 1], pcs_csb[.!mask, 2]; c=:lightgrey, alpha=.1, markersize=2, markerstrokewidth=0)
    scatter!(pcs_csb[mask, 1], pcs_csb[mask, 2];
        title="$label ($cnt)", 
        c=:aqua,
        alpha=1, 
        markersize=3, 
        markerstrokewidth=0, 
    )
    push!(ps, p)
end
plot(ps...;
    layout=grid(3,3),
    size=(700,700),
    supertitle="CSB669 10PCs",
    # xlabel="PC 1",
    # ylabel="PC 2",
    titlefontsize=8,
    labelfontsize=6,
    ticks=false,
    margin=5Plots.mm,

)

Each sub-plot shows 669 CSB strains on PC 1 (x axis) and PC 2 (y axis). Aqua dots correspond to strains belonging to the species listed in each sub-title followed by the number of strains belonging to that species. Remaining strains are shown in grey.

Subset to Bacteroidacae

Same story but subset to the level of Family

mask = biobank.obs.NCBI_Family .== "Bacteroidaceae";
csb_mtx = biobank["oggs"].X[:,:][mask, :]

# find variable OGGs
ogg_mask = vec(var(csb_mtx, dims=1) .> 0)
# mean center variable OGGs
meancentered_csbmtx = mapslices(x->x.-mean(x), csb_mtx[:, ogg_mask], dims=1)
# svd of mean centered matrix
bbusv_csb = svd(meancentered_csbmtx);

@show size(meancentered_csbmtx);
size(meancentered_csbmtx) = (229, 3755)
# Principal Components from SVD
pcs_csb = bbusv_csb.U * Diagonal(bbusv_csb.S);
# percent of variance explained by each PC
pctvar = (bbusv_csb.S .^ 2 / sum(bbusv_csb.S .^ 2)) * 100;
map colors to NCBI Species in correct order for plotting
genusid = biobank.obs.NCBI_Genus[mask]

# collect Butyrate and Succinate values for colormap
butyrate_foldchange_per_strain = biobank["metabolites_foldchange"].X[:, findfirst(==("Butyrate"), biobank["metabolites_foldchange"].var.label)][mask, :];
succinate_foldchange_per_strain = biobank["metabolites_foldchange"].X[:, findfirst(==("Succinate"), biobank["metabolites_foldchange"].var.label)][mask, :];
scatter(pcs_csb[:, 1], pcs_csb[:, 2],
    group=genusid,
    alpha=1, markerstrokewidth=.3,
    size=(600,600),
    legend=:outertop,
    legendcolumns=2,
    ticks=:none,
    title="Bacteroidaceae (2PCs)",
    xlabel="PC_1 ($(round(pctvar[1], digits=2))%)",
    ylabel="PC_2 ($(round(pctvar[2], digits=2))%)",
)
scatter(pcs_csb[:, 1], pcs_csb[:, 2],
    marker_z=butyrate_foldchange_per_strain,
    c=:bwr, clims=butyrate_lims,
    alpha=.5, markerstrokewidth=.3, ticks=false,
    size=(600,600), margin=5Plots.mm,
    title="colored by butyrate",
    xlabel="PC 1 ($(round(pctvar[1], digits=2))%)",
    ylabel="PC 2 ($(round(pctvar[2], digits=2))%)",
)
scatter(pcs_csb[:, 1], pcs_csb[:, 2],
    marker_z=succinate_foldchange_per_strain,
    c=:bwr, clims=succinate_lims,
    alpha=.5, markerstrokewidth=.3, ticks=false,
    size=(600,600), margin=5Plots.mm,
    title="colored by succinate",
    xlabel="PC 1 ($(round(pctvar[1], digits=2))%)",
    ylabel="PC 2 ($(round(pctvar[2], digits=2))%)",
)

Subset to Phocaeicoli

Same story subset to the level of Genus

mask = biobank.obs.NCBI_Genus .== "Phocaeicola";
csb_mtx = biobank["oggs"].X[:,:][mask, :]

# find variable OGGs
ogg_mask = vec(var(csb_mtx, dims=1) .> 0)
# mean center variable OGGs
meancentered_csbmtx = mapslices(x->x.-mean(x), csb_mtx[:, ogg_mask], dims=1)
# svd of mean centered matrix
bbusv_csb = svd(meancentered_csbmtx);

@show size(meancentered_csbmtx);
size(meancentered_csbmtx) = (103, 2065)
# Principal Components from SVD
pcs_csb = bbusv_csb.U * Diagonal(bbusv_csb.S);
# percent of variance explained by each PC
pctvar = (bbusv_csb.S .^ 2 / sum(bbusv_csb.S .^ 2)) * 100;
map colors to NCBI Species in correct order for plotting
speciesid = biobank.obs.NCBI_Species[mask]

# collect Butyrate and Succinate values for colormap
butyrate_foldchange_per_strain = biobank["metabolites_foldchange"].X[:, findfirst(==("Butyrate"), biobank["metabolites_foldchange"].var.label)][mask, :];
succinate_foldchange_per_strain = biobank["metabolites_foldchange"].X[:, findfirst(==("Succinate"), biobank["metabolites_foldchange"].var.label)][mask, :];
scatter(pcs_csb[:, 1], pcs_csb[:, 2],
    group=speciesid,
    alpha=1, markerstrokewidth=.3,
    size=(600,600),
    legend=:outertop,
    legendcolumns=2,
    ticks=:none,
    title="Phocaeicola (2PCs)",
    xlabel="PC_1 ($(round(pctvar[1], digits=2))%)",
    ylabel="PC_2 ($(round(pctvar[2], digits=2))%)",
)
scatter(pcs_csb[:, 1], pcs_csb[:, 2],
    marker_z=butyrate_foldchange_per_strain,
    c=:bwr, clims=butyrate_lims,
    alpha=.5, markerstrokewidth=.3, ticks=false,
    size=(600,600), margin=5Plots.mm,
    title="colored by butyrate",
    xlabel="PC 1 ($(round(pctvar[1], digits=2))%)",
    ylabel="PC 2 ($(round(pctvar[2], digits=2))%)",
)
scatter(pcs_csb[:, 1], pcs_csb[:, 2],
    marker_z=succinate_foldchange_per_strain,
    c=:bwr, clims=succinate_lims,
    alpha=.5, markerstrokewidth=.3, ticks=false,
    size=(600,600), margin=5Plots.mm,
    title="colored by succinate",
    xlabel="PC 1 ($(round(pctvar[1], digits=2))%)",
    ylabel="PC 2 ($(round(pctvar[2], digits=2))%)",
)

Subset to P. vulgatus

Same story subset to the level of an individual species.

Even here where we have introduced a large amount of constraint in the type of variation in our dataset, metabolite concentrations are not what is varying by these principal axes of variation. Meaning we can find many cases where strains are overlaid on top of each other on these axes, yet still have very different behavior either eating or producing metabolites. Similarly, on either side of the plot, we can see strains that produce/consume metabolites at similar levels.

mask = biobank.obs.NCBI_Species .== "Phocaeicola vulgatus";
csb_mtx = biobank["oggs"].X[:,:][mask, :]

# find variable OGGs
ogg_mask = vec(var(csb_mtx, dims=1) .> 0)
# mean center variable OGGs
meancentered_csbmtx = mapslices(x->x.-mean(x), csb_mtx[:, ogg_mask], dims=1)
# svd of mean centered matrix
bbusv_csb = svd(meancentered_csbmtx);

@show size(meancentered_csbmtx);
size(meancentered_csbmtx) = (93, 1551)
# Principal Components from SVD
pcs_csb = bbusv_csb.U * Diagonal(bbusv_csb.S);
# percent of variance explained by each PC
pctvar = (bbusv_csb.S .^ 2 / sum(bbusv_csb.S .^ 2)) * 100;
map colors to NCBI Species in correct order for plotting
speciesid = biobank.obs.NCBI_Species[mask]

# collect Butyrate and Succinate values for colormap
butyrate_foldchange_per_strain = biobank["metabolites_foldchange"].X[:, findfirst(==("Butyrate"), biobank["metabolites_foldchange"].var.label)][mask, :];
succinate_foldchange_per_strain = biobank["metabolites_foldchange"].X[:, findfirst(==("Succinate"), biobank["metabolites_foldchange"].var.label)][mask, :];
scatter(pcs_csb[:, 1], pcs_csb[:, 2],  c=4,
    alpha=1, markerstrokewidth=.3,
    size=(600,600),
    ticks=:none,
    title="Phocaeicola (2PCs)",
    margin=5Plots.mm,
    xlabel="PC_1 ($(round(pctvar[1], digits=2))%)",
    ylabel="PC_2 ($(round(pctvar[2], digits=2))%)",
)
scatter(pcs_csb[:, 1], pcs_csb[:, 2],
    marker_z=butyrate_foldchange_per_strain,
    c=:bwr, clims=butyrate_lims,
    alpha=.5, markerstrokewidth=.3, ticks=false,
    size=(600,600), margin=5Plots.mm,
    title="colored by butyrate",
    xlabel="PC 1 ($(round(pctvar[1], digits=2))%)",
    ylabel="PC 2 ($(round(pctvar[2], digits=2))%)",
)
scatter(pcs_csb[:, 1], pcs_csb[:, 2],
    marker_z=succinate_foldchange_per_strain,
    c=:bwr, clims=succinate_lims,
    alpha=.5, markerstrokewidth=.3, ticks=false,
    size=(600,600), margin=5Plots.mm,
    title="colored by succinate",
    xlabel="PC 1 ($(round(pctvar[1], digits=2))%)",
    ylabel="PC 2 ($(round(pctvar[2], digits=2))%)",
)

Umap analysis of biobank strains

UMAP doesn’t help. If we use 10 principal components, and compress with UMAP into 2 dimensions we get the same story. Species are compressed into nearly single points, and the metabolite variability contained within is lost.

Calcuate PCs on full CSB
theme(:default, grid=false, framestyle=:box, label="", ratio=1)
csb_mtx = biobank["oggs"].X[:,:]

# find variable OGGs
ogg_mask = vec(var(csb_mtx, dims=1) .> 0)
# mean center variable OGGs
meancentered_csbmtx = mapslices(x->x.-mean(x), csb_mtx[:, ogg_mask], dims=1)
# svd of mean centered matrix
bbusv_csb = svd(meancentered_csbmtx);

@show size(meancentered_csbmtx);

# Principal Components from SVD
pcs_csb = bbusv_csb.U * Diagonal(bbusv_csb.S);
# percent of variance explained by each PC
pctvar = (bbusv_csb.S .^ 2 / sum(bbusv_csb.S .^ 2)) * 100;

familyid = bbobs.NCBI_Family
orderedfamilylabels = stack(DataFrame(countmap(familyid)), 1:12) |>
    df -> DataFrames.transform(df, :value => (-) => :minusvalue) |>
    df -> sort(df, [:minusvalue, :variable], rev=false) |>
    df -> df.variable
familycolors = permutedims(palette(:Set3_12).colors.colors[indexin(sort(unique(familyid)), orderedfamilylabels)]);
size(meancentered_csbmtx) = (669, 8359)
seed!(424242)
pcs_csb = bbusv_csb.U * Diagonal(bbusv_csb.S)
n_neighbors = floor(Int,3*sqrt(669))
pcs_umap_coords = umap(pcs_csb[:, 1:10]', 2; n_neighbors, metric=Euclidean())';
@show n_neighbors;
n_neighbors = 77
scatter(pcs_umap_coords[:, 1], pcs_umap_coords[:, 2],
    group=familyid,
    alpha=1, markerstrokewidth=.3,
    size=(600,600),
    legend=:outertop,
    legendcolumns=2,
    c=familycolors,
    margin=5Plots.mm,
    ticks=:none,
    title="CSB669",
    xlabel="UMAP 1",
    ylabel="UMAP 2",
)
show plotting code
species_labelscounts = map(speciescolors.species_name[Not(5,6)]) do label
    cnt = occursin.(biobank.obs.Species, label) |> sum
    label, cnt
end |> x->sort(x, by=last, rev=true)

ps = []
for (label, cnt) in species_labelscounts
    mask = occursin.(biobank.obs.Species, label)
    p = plot()
    scatter!(pcs_umap_coords[.!mask, 1], pcs_umap_coords[.!mask, 2]; c=:lightgrey, alpha=.1, markersize=2, markerstrokewidth=0)
    scatter!(pcs_umap_coords[mask, 1], pcs_umap_coords[mask, 2];
        title="$label ($cnt)", 
        c=:aqua,
        alpha=1, 
        markersize=3, 
        markerstrokewidth=0, 
    )
    push!(ps, p)
end
plot(ps...;
    layout=grid(3,3),
    size=(700,700),
    supertitle="CSB669 10PCs",
    # xlabel="PC 1",
    # ylabel="PC 2",
    titlefontsize=8,
    labelfontsize=6,
    ticks=false,
    margin=5Plots.mm,

)

Each sub-plot shows 669 CSB strains on UMAP 1 (x axis) and UMAP 2 (y axis) generated from euclidean distance across the leading 10 PCs and 77 shared nearest neighbors. Aqua dots correspond to strains belonging to the species listed in each sub-title followed by the number of strains belonging to that species. Remaining strains are shown in grey.